# read the exported articles from disk
PP_NYT <- read.csv(file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Data/PP_NYT.csv")
# keep only the columns used in the analysis, as a separate working data frame
pp <- PP_NYT[, c("DATE", "TITLE", "LENGTH", "GRAPHIC", "SECTION", "BYLINE", "DATELINE", "TEXT")]
names(pp)
## [1] "DATE" "TITLE" "LENGTH" "GRAPHIC" "SECTION" "BYLINE"
## [7] "DATELINE" "TEXT"
# create a new date variable that will represent the article date in YYYY-MM-DD format
# (the raw DATE text is parsed as "<month name> <day>, <year>")
# NOTE(review): "%B" matches month names in the current locale -- confirm the
# script is run under an English locale, otherwise the dates parse to NA
date <- as.character(pp$DATE)
betterDates <- as.Date(date, format = "%B %d, %Y")
pp$date.num <- betterDates
# create a variable for just the year each article was published ("YYYY", stored as text)
pp$year <- format(pp$date.num, "%Y")
# create a variable for the year and month each article was published ("YYYY-MM", stored as text)
pp$yearmonth <- format(pp$date.num, "%Y-%m")
Now we’re going to look at the topics of the articles on Planned Parenthood, and at how they change over time. Though we’re using this analysis on Planned Parenthood articles, we could really use it on any corpus, to analyze how the topics of a set of documents have changed over a given variable—be it time, from document to document, by person, and so on.
# seed R's random number generator for this section
# NOTE(review): MALLET samples inside the JVM, so set.seed() may not make the
# topic model itself reproducible -- confirm, and seed the trainer object
# directly if the installed mallet version offers a way to do so
set.seed(1234)
# load the libraries we will need for this section
library(mallet) # a wrapper around the Java machine learning tool MALLET
## Loading required package: rJava
library(wordcloud) # to visualize wordclouds
## Loading required package: RColorBrewer
# subset the data to articles published after 2009 (i.e. 2010 onward), for an
# initial analysis of the topic models; `year` is stored as text, so convert it
# to a number before comparing
pp.2010 <- subset(pp, as.integer(pp$year) > 2009)
# we first have to create an 'id' column; the row names serve as document ids
pp.2010$id <- rownames(pp.2010)
# remove punctuation (every punctuation character is replaced by a space)
pp.2010$TEXT <- gsub(pattern = "[[:punct:]]", replacement = " ", pp.2010$TEXT)
# load data into mallet: document ids, document texts, stop-word file, then the
# remaining import options
mallet.instances <- mallet.import(
  pp.2010$id,
  pp.2010$TEXT,
  "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Data/stoplist.csv",
  FALSE,
  token.regexp = "[\\p{L}']+"
)
# choose the number of topics to model
n.topics <- 10
# create a topic trainer object
topic.model <- MalletLDA(n.topics)
# load the documents
topic.model$loadDocuments(mallet.instances)
# get the vocabulary, and some statistics about word frequencies; after running this code once through, i went back and re-curated the stop word lists, to remove some of the more frequently used words that weren't otherwise caught
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
# examine some of the vocabulary (head() avoids NA padding if there are fewer than 50 words)
head(word.freqs, 50)
## words term.freq doc.freq
## 1 weight 3 2
## 2 political 1019 478
## 3 universe 15 13
## 4 shoulders 13 13
## 5 candidates 662 253
## 6 running 315 203
## 7 senate 984 315
## 8 seat 138 69
## 9 long 638 407
## 10 held 354 270
## 11 edward 37 36
## 12 kennedy 117 58
## 13 embarked 8 8
## 14 frenzied 3 3
## 15 day 668 380
## 16 campaigning 43 36
## 17 monday 301 188
## 18 groups 550 302
## 19 sides 138 111
## 20 health 2165 678
## 21 care 1324 563
## 22 debate 938 353
## 23 flooded 10 8
## 24 state 2436 621
## 25 money 831 370
## 26 advertisements 30 26
## 27 ground 161 115
## 28 troops 46 28
## 29 influence 64 55
## 30 outcome 48 43
## 31 frenetic 3 3
## 32 end 501 342
## 33 race 470 241
## 34 originally 26 25
## 35 thought 217 161
## 36 cakewalk 2 2
## 37 martha 27 20
## 38 coakley 24 10
## 39 democratic 729 311
## 40 attorney 144 95
## 41 general 359 223
## 42 massachusetts 226 112
## 43 overwhelmingly 25 22
## 44 polls 210 126
## 45 showed 163 132
## 46 scott 137 103
## 47 brown 175 69
## 48 republican 2627 632
## 49 senator 808 331
## 50 closed 101 78
# rank the vocabulary from the most to the least frequent term and show the top rows
word.freqs.ordered <- word.freqs[order(-word.freqs[["term.freq"]]), ]
head(word.freqs.ordered, n = 6L)
## words term.freq doc.freq
## 304 abortion 4148 640
## 661 women 3961 709
## 282 planned 3268 1285
## 283 parenthood 3173 1276
## 48 republican 2627 632
## 24 state 2436 621
# optimize hyperparameters every 20 iterations, after 50 burn-in iterations
topic.model$setAlphaOptimization(20, 50)
# now train a model, specifying the number of iterations
topic.model$train(100)
# get the probability of topics in documents and the probability of words in topics; by default the functions return word counts, so to get the probabilities we can normalize and add smoothing, in order to ensure that nothing has a probability of exactly 0
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
# what are the top words in topic 4? (row k of topic.words holds topic k)
mallet.top.words(topic.model, topic.words[4, ])
## words weights
## 1 street 0.010344397
## 2 city 0.010193410
## 3 church 0.008457059
## 4 brooklyn 0.007664378
## 5 art 0.005588307
## 6 public 0.004795625
## 7 john 0.004493651
## 8 local 0.004267171
## 9 avenue 0.004191677
## 10 president 0.003927450
# create a vector containing short names for the topics: the five top words of
# each topic, joined by single spaces
topics.labels <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "abortion women state law court"
## [2] "life people religious make good"
## [3] "house republican republicans senate bill"
## [4] "street city church brooklyn art"
## [5] "people man day told gun"
## [6] "republican women campaign romney party"
## [7] "planned parenthood health services care"
## [8] "family planned school years parenthood"
## [9] "obama president romney federal government"
## [10] "women sex education school young"
# show the first few document titles with at least .25 of its content devoted to topic 1
# doc.topics has one row per document and one column per topic, so the weights
# of topic 1 are column 1; indexing row 1 instead would give the topic mix of
# the first document and recycle that short vector over all titles.
# NOTE: the titles recorded below came from the earlier row-wise index and will
# differ when this line is re-run.
head(pp.2010$TITLE[doc.topics[, 1] > 0.25], 10)
## [1] After Long Decline, Teenage Pregnancy Rate Rises
## [2] To Court Blacks, Foes of Abortion Make Racial Case
## [3] New Spending for a Wider Range of Sex Education
## [4] Paid Notice: Deaths SLOAN, LISA
## [5] The New Abortion Providers
## [6] Planned Parenthood Clinics Are Stripped of Affiliation After Complaints
## [7] A Hidden Minefield at Pregnancy Centers
## [8] Reproductive Choices Women Face
## [9] Planned Parenthood Fires Employee After Video
## [10] Women and Abortion
## 3791 Levels: 'CONSCIENCE' OF CONSERVATIVES GOES ON THE ATTACK ...
# create a vector that has the title of the most representative text for each topic:
# for topic k, take the document (row of doc.topics) with the largest weight in
# column k. Indexing doc.topics[k, ] instead would return the strongest topic
# of document k, which is not a row number of pp.2010.
topics.articles <- vapply(
  seq_len(n.topics),
  function(k) as.character(pp.2010$TITLE[which.max(doc.topics[, k])]),
  character(1)
)
# NOTE: the titles recorded below (with their repeated entries) came from the
# earlier row-wise index and will differ when this block is re-run.
topics.articles
## [1] "After Long Decline, Teenage Pregnancy Rate Rises"
## [2] "The Candidates, and Supporters From All Over, Push to the Finish Line"
## [3] "From High Jinks to Handcuffs"
## [4] "From High Jinks to Handcuffs"
## [5] "From High Jinks to Handcuffs"
## [6] "As Lender, Giannoulias Impacted Bank Woes"
## [7] "Ruth P. Smith, 102; Abortion-Rights Pioneer"
## [8] "Ruth P. Smith, 102; Abortion-Rights Pioneer"
## [9] "Ruth P. Smith, 102; Abortion-Rights Pioneer"
## [10] "From High Jinks to Handcuffs"
# now let's look at how topics differ across different years?
# For every year, recompute the topic-word probabilities from that year's
# articles only and store them as topic.words.<year>, the names used by the
# code that follows.
# NOTE: pp.2010 only holds articles published after 2009, so the 2009 mask
# selects no documents; that year is kept so the existing name stays defined.
for (yr in 2009:2015) {
  assign(
    paste0("topic.words.", yr),
    mallet.subset.topic.words(topic.model, pp.2010$year == yr, smoothed = TRUE, normalized = TRUE)
  )
}
# five top words of every topic, based on the 2010 articles only
topics.labels.2010 <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words.2010[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
topics.labels.2010
## [1] "abortion women abortions doctors clinics"
## [2] "life people make control nietzsche"
## [3] "republican house senate senator cruz"
## [4] "street art museum brooklyn city"
## [5] "keefe wetmore people black solondz"
## [6] "campaign political brown conservative democratic"
## [7] "planned parenthood health organization services"
## [8] "family godfrey planned years fellowship"
## [9] "bank president military giannoulias money"
## [10] "sex education abstinence university institute"
# five top words of every topic, based on the 2011 articles only
topics.labels.2011 <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words.2011[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
topics.labels.2011
## [1] "abortion women state law abortions"
## [2] "people life vernacchio make good"
## [3] "house republicans republican democrats bill"
## [4] "street weiner city east tanton"
## [5] "keefe man people told asked"
## [6] "republican party voters political campaign"
## [7] "planned parenthood health services money"
## [8] "school planned law parenthood husband"
## [9] "budget obama cuts federal president"
## [10] "sex women education school sexual"
# five top words of every topic, based on the 2012 articles only
topics.labels.2012 <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words.2012[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
topics.labels.2012
## [1] "abortion women state law court"
## [2] "religious life catholic people control"
## [3] "house republican republicans senate bill"
## [4] "street city brinker foundation vaughn"
## [5] "people told day man room"
## [6] "romney women republican campaign voters"
## [7] "planned parenthood komen health cancer"
## [8] "children school kimbrough family university"
## [9] "romney obama president administration santorum"
## [10] "women sex students education young"
# five top words of every topic, based on the 2013 articles only
topics.labels.2013 <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words.2013[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
topics.labels.2013
## [1] "abortion women state abortions texas"
## [2] "people life time religious gay"
## [3] "bill senate republican house republicans"
## [4] "church brooklyn queens city thompson"
## [5] "people day home told play"
## [6] "republican party campaign voters women"
## [7] "parenthood health planned cancer breast"
## [8] "planned family mother parenthood died"
## [9] "obama president administration government tax"
## [10] "women gilbert sex percent education"
# five top words of every topic, based on the 2014 articles only
topics.labels.2014 <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words.2014[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
topics.labels.2014
## [1] "abortion women court law state"
## [2] "religious life book good time"
## [3] "republican house christie republicans senate"
## [4] "sage city art dance ned"
## [5] "dunham people woman day man"
## [6] "women republican voters democratic election"
## [7] "planned parenthood health services care"
## [8] "years planned ny husband children"
## [9] "insurance obama care pay president"
## [10] "women sex school data found"
# five top words of every topic, based on the 2015 articles only
topics.labels.2015 <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words.2015[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
topics.labels.2015
## [1] "abortion women law court state"
## [2] "people life make good talk"
## [3] "republican trump republicans house senator"
## [4] "deace street art church city"
## [5] "people gun shooting police colorado"
## [6] "campaign fiorina republican party candidates"
## [7] "planned parenthood tissue health videos"
## [8] "parenthood planned children years family"
## [9] "obama president federal people tax"
## [10] "women sex school university writer"
# store each year's labels as a plain character vector
t.2010 <- as.character(topics.labels.2010)
t.2011 <- as.character(topics.labels.2011)
t.2012 <- as.character(topics.labels.2012)
t.2013 <- as.character(topics.labels.2013)
t.2014 <- as.character(topics.labels.2014)
t.2015 <- as.character(topics.labels.2015)
# view all the topics as they change over the years: one row per topic, one
# column per year
topics.over.time <- cbind(
  t.2010 = t.2010,
  t.2011 = t.2011,
  t.2012 = t.2012,
  t.2013 = t.2013,
  t.2014 = t.2014,
  t.2015 = t.2015
)
# look at each topic individually -- the first topic over the years
topics.over.time[1, ]
## t.2010
## "abortion women abortions doctors clinics"
## t.2011
## "abortion women state law abortions"
## t.2012
## "abortion women state law court"
## t.2013
## "abortion women state abortions texas"
## t.2014
## "abortion women court law state"
## t.2015
## "abortion women law court state"
# the second topic: its label in each year (row 2 of topics.over.time)
topics.over.time[2, ]
## t.2010
## "life people make control nietzsche"
## t.2011
## "people life vernacchio make good"
## t.2012
## "religious life catholic people control"
## t.2013
## "people life time religious gay"
## t.2014
## "religious life book good time"
## t.2015
## "people life make good talk"
# the third topic: its label in each year (row 3 of topics.over.time)
topics.over.time[3, ]
## t.2010
## "republican house senate senator cruz"
## t.2011
## "house republicans republican democrats bill"
## t.2012
## "house republican republicans senate bill"
## t.2013
## "bill senate republican house republicans"
## t.2014
## "republican house christie republicans senate"
## t.2015
## "republican trump republicans house senator"
# the fourth topic: its label in each year (row 4 of topics.over.time)
topics.over.time[4, ]
## t.2010
## "street art museum brooklyn city"
## t.2011
## "street weiner city east tanton"
## t.2012
## "street city brinker foundation vaughn"
## t.2013
## "church brooklyn queens city thompson"
## t.2014
## "sage city art dance ned"
## t.2015
## "deace street art church city"
# the fifth topic: its label in each year (row 5 of topics.over.time)
topics.over.time[5, ]
## t.2010
## "keefe wetmore people black solondz"
## t.2011
## "keefe man people told asked"
## t.2012
## "people told day man room"
## t.2013
## "people day home told play"
## t.2014
## "dunham people woman day man"
## t.2015
## "people gun shooting police colorado"
# the sixth topic: its label in each year (row 6 of topics.over.time)
topics.over.time[6, ]
## t.2010
## "campaign political brown conservative democratic"
## t.2011
## "republican party voters political campaign"
## t.2012
## "romney women republican campaign voters"
## t.2013
## "republican party campaign voters women"
## t.2014
## "women republican voters democratic election"
## t.2015
## "campaign fiorina republican party candidates"
# the seventh topic: its label in each year (row 7 of topics.over.time)
topics.over.time[7, ]
## t.2010
## "planned parenthood health organization services"
## t.2011
## "planned parenthood health services money"
## t.2012
## "planned parenthood komen health cancer"
## t.2013
## "parenthood health planned cancer breast"
## t.2014
## "planned parenthood health services care"
## t.2015
## "planned parenthood tissue health videos"
# the eighth topic: its label in each year (row 8 of topics.over.time)
topics.over.time[8, ]
## t.2010
## "family godfrey planned years fellowship"
## t.2011
## "school planned law parenthood husband"
## t.2012
## "children school kimbrough family university"
## t.2013
## "planned family mother parenthood died"
## t.2014
## "years planned ny husband children"
## t.2015
## "parenthood planned children years family"
# the ninth topic: its label in each year (row 9 of topics.over.time)
topics.over.time[9, ]
## t.2010
## "bank president military giannoulias money"
## t.2011
## "budget obama cuts federal president"
## t.2012
## "romney obama president administration santorum"
## t.2013
## "obama president administration government tax"
## t.2014
## "insurance obama care pay president"
## t.2015
## "obama president federal people tax"
# the tenth topic: its label in each year (row 10 of topics.over.time)
topics.over.time[10, ]
## t.2010
## "sex education abstinence university institute"
## t.2011
## "sex women education school sexual"
## t.2012
## "women sex students education young"
## t.2013
## "women gilbert sex percent education"
## t.2014
## "women sex school data found"
## t.2015
## "women sex school university writer"
We can represent this relationship visually, as follows:
# with the wordcloud package
# first, a single cloud of the 100 top words of one topic; the topic number
# and word count declared here are the ones actually passed on
topic.num <- 1
num.top.words <- 100
topic.top.words <- mallet.top.words(topic.model, topic.words[topic.num, ], num.top.words)
wordcloud(topic.top.words$words, topic.top.words$weights, c(4, .8), rot.per = 0, random.order = FALSE, colors = "red", ordered.colors = TRUE)
# then one cloud per topic (one per row of topic.words), 25 words each
num.topics <- nrow(topic.words)
num.top.words <- 25
for (i in seq_len(num.topics)) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i, ], num.top.words)
  wordcloud(topic.top.words$words, topic.top.words$weights, c(4, .8), rot.per = 0, random.order = FALSE, colors = "red", ordered.colors = TRUE)
}
We can also create a cluster dendrogram of the topics.
# from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
# transpose the doc topics so that rows are topics and columns are documents,
# then rescale every row so that it sums to 1
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
write.csv(topic.docs, "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-docs.csv")
# Get a vector containing short names for the topics: the five top words of
# each topic, joined by single spaces
topics.labels <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "abortion women state law court"
## [2] "life people religious make good"
## [3] "house republican republicans senate bill"
## [4] "street city church brooklyn art"
## [5] "people man day told gun"
## [6] "republican women campaign romney party"
## [7] "planned parenthood health services care"
## [8] "family planned school years parenthood"
## [9] "obama president romney federal government"
## [10] "women sex education school young"
# save the topic labels to disk
write.csv(x = topics.labels, file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-labels.csv")
# create data.frame with columns as docs and rows as topics; each column is
# named after its document id
topic_docs <- setNames(data.frame(topic.docs), pp.2010$id)
# cluster the topics on their word distributions and draw the tree, labelled
# with the short topic names
plot(hclust(dist(topic.words)), labels=topics.labels)
Now we can repeat this analysis on a larger subset of the data, from 1983 to the present.
# seed R's random number generator for this section
# NOTE(review): MALLET samples inside the JVM, so set.seed() may not make the
# topic model itself reproducible -- confirm, and seed the trainer object
# directly if the installed mallet version offers a way to do so
set.seed(12345)
# load the libraries we will need for this section
library(mallet) # a wrapper around the Java machine learning tool MALLET
library(wordcloud) # to visualize wordclouds
# subset the data to articles published after 1982 (i.e. 1983 onward); `year`
# is stored as text, so convert it to a number before comparing
# NOTE(review): the original note read "1982 and later, the dates for which we
# have the complete data" -- if 1982 itself should be kept, use >= 1982 here
# and add 1982 to the per-year code below
pp.1982 <- subset(pp, as.integer(pp$year) > 1982)
# we first have to create an 'id' column; the row names serve as document ids
pp.1982$id <- rownames(pp.1982)
# remove punctuation (every punctuation character is replaced by a space)
pp.1982$TEXT <- gsub(pattern = "[[:punct:]]", replacement = " ", pp.1982$TEXT)
# load data into mallet: document ids, document texts, stop-word file, then the
# remaining import options
mallet.instances <- mallet.import(
  pp.1982$id,
  pp.1982$TEXT,
  "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Data/stoplist.csv",
  FALSE,
  token.regexp = "[\\p{L}']+"
)
# decide what number of topics to model
n.topics <- 10
# create a topic trainer object.
topic.model <- MalletLDA(n.topics)
# load our documents
topic.model$loadDocuments(mallet.instances)
# get the vocabulary, and some statistics about word frequencies. these may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
# examine some of the vocabulary (head() avoids NA padding if there are fewer than 50 words)
head(word.freqs, 50)
## words term.freq doc.freq
## 1 richard 76 63
## 2 schweiker 26 8
## 3 secretary 511 334
## 4 health 5600 1696
## 5 human 1064 603
## 6 services 2391 1105
## 7 today 2099 948
## 8 recommended 111 99
## 9 rule 707 329
## 10 requiring 367 264
## 11 family 3902 1504
## 12 planning 1954 763
## 13 clinics 2772 844
## 14 supported 481 391
## 15 federal 3839 1300
## 16 money 2250 932
## 17 notify 118 90
## 18 parents 1291 529
## 19 minors 151 99
## 20 receive 458 363
## 21 birth 2013 808
## 22 control 2153 968
## 23 pills 560 174
## 24 diaphragms 35 27
## 25 intrauterine 71 53
## 26 devices 155 102
## 27 planned 7738 3773
## 28 parenthood 7559 3807
## 29 federation 1041 803
## 30 america 1607 1096
## 31 threaten 45 45
## 32 teen 1160 295
## 33 agers 592 195
## 34 families 607 376
## 35 immediately 256 233
## 36 moved 476 371
## 37 district 948 515
## 38 court 7542 1201
## 39 block 433 299
## 40 implementation 22 17
## 41 grounds 194 163
## 42 violated 146 118
## 43 statutes 80 53
## 44 constitution 550 232
## 45 guarantee 125 102
## 46 invasion 26 24
## 47 privacy 412 228
## 48 approved 554 338
## 49 office 1472 813
## 50 management 261 189
# rank the vocabulary from the most to the least frequent term and show the top rows
word.freqs.ordered <- word.freqs[order(-word.freqs[["term.freq"]]), ]
head(word.freqs.ordered, n = 6L)
## words term.freq doc.freq
## 98 abortion 16465 1952
## 149 women 10390 2042
## 27 planned 7738 3773
## 28 parenthood 7559 3807
## 38 court 7542 1201
## 560 state 6152 1642
# optimize hyperparameters every 20 iterations, after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
# now train a model. Note that hyperparameter optimization is on, by default. We can specify the number of iterations. Here we'll use a large-ish round number.
topic.model$train(100)
# get the probability of topics in documents and the probability of words in topics; by default the functions return word counts, so to get the probabilities we can normalize and add smoothing, in order to ensure that nothing has a probability of exactly 0
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
# what are the top words in topic 6? (row k of topic.words holds topic k)
mallet.top.words(topic.model, topic.words[6, ])
## words weights
## 1 planned 0.011511540
## 2 parenthood 0.010029485
## 3 university 0.009953698
## 4 years 0.007747456
## 5 school 0.007419046
## 6 college 0.007048532
## 7 family 0.006922221
## 8 died 0.006610652
## 9 board 0.005945411
## 10 president 0.005886466
# Get a vector containing short names for the topics: the five top words of
# each topic, joined by single spaces
topics.labels <- vapply(
  seq_len(n.topics),
  function(k) {
    top <- mallet.top.words(topic.model, topic.words[k, ], num.top.words = 5)
    paste(top$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "life people church public religious"
## [2] "street tickets center avenue art"
## [3] "people time year years day"
## [4] "million health administration money united"
## [5] "republican president house obama republicans"
## [6] "planned parenthood university years school"
## [7] "court law justice abortion supreme"
## [8] "abortion abortions planned parenthood women"
## [9] "health family planned sex school"
## [10] "women birth drug control percent"
# show the first few document titles with at least .25 of its content devoted to topic 1
# doc.topics has one row per document and one column per topic, so the weights
# of topic 1 are column 1; indexing row 1 instead would give the topic mix of
# the first document and recycle that short vector over all titles.
# NOTE: the titles recorded below came from the earlier row-wise index and will
# differ when this line is re-run.
head(pp.1982$TITLE[doc.topics[, 1] > 0.25], 10)
## [1] U.S. TO REQUIRE NOTICE TO PARENTS IF CHILDREN RECEIVE CONTRACEPTIVES
## [2] STATE SENATE PASSES NEW FINANCIAL-DISCLOSURE BILL
## [3] JOHN P. BLAIR, FOUNDER OF COMPANY WITH VARIED INTERESTS, DIES AT 83
## [4] TOWN HOUSES IN HARLEM ATTRACTING BUYERS
## [5] WESTCHESTER GUIDE
## [6] THE U.S. SHOULD FUND NEITHER LEFT NOR RIGHT
## [7] ART VIEW; A COLLECTION THAT BREATHES THE SPIRIT OF MODERNISM
## [8] THE ORIGIN OF A PLAY
## [9] AROUND THE NATION ; 1982 U.S. Abortion Total Shows Small Decline By United Press International
## [10] POPULATION GROWTH: HOW U.S. POLICY EVOLVED
## 3791 Levels: 'CONSCIENCE' OF CONSERVATIVES GOES ON THE ATTACK ...
# create a vector that has the title of the most representative text for each topic:
# for topic k, take the document (row of doc.topics) with the largest weight in
# column k. Indexing doc.topics[k, ] instead would return the strongest topic
# of document k, which is not a row number of pp.1982.
topics.articles <- vapply(
  seq_len(n.topics),
  function(k) as.character(pp.1982$TITLE[which.max(doc.topics[, k])]),
  character(1)
)
# NOTE: the titles recorded below (with their repeated entries) came from the
# earlier row-wise index and will differ when this block is re-run.
topics.articles
## [1] "U.S. TO REQUIRE NOTICE TO PARENTS IF CHILDREN RECEIVE CONTRACEPTIVES"
## [2] "BIRTH-CONTROL RULE: CLINICS PONDER EFFECTS"
## [3] "WOMEN SEEK ABORTION LOANS"
## [4] "WOMEN SEEK ABORTION LOANS"
## [5] "U.S. TO REQUIRE NOTICE TO PARENTS IF CHILDREN RECEIVE CONTRACEPTIVES"
## [6] "A BACKDOOR ASSAULT ON FAMILY PLANNING"
## [7] "A LEGISLATIVE BATTLE IN PENNSYLVANIA"
## [8] "A BACKDOOR ASSAULT ON FAMILY PLANNING"
## [9] "A LEGISLATIVE BATTLE IN PENNSYLVANIA"
## [10] "U.S. TO REQUIRE NOTICE TO PARENTS IF CHILDREN RECEIVE CONTRACEPTIVES"
# now let's look at how topics differ across different years?
# For every year from 1983 through 2015, recompute the topic-word
# probabilities from that year's articles only and store them as
# topic.words.<year>, the names used by the code that follows.
for (yr in 1983:2015) {
  assign(
    paste0("topic.words.", yr),
    mallet.subset.topic.words(topic.model, pp.1982$year == yr, smoothed = TRUE, normalized = TRUE)
  )
}
# Build the five-word label of every topic for each year from 1983 through
# 2015. The result for a year is stored as topics.labels.<year>, computed from
# that year's topic.words.<year> matrix, so later code can keep using the
# per-year variable names.
for (yr in 1983:2015) {
  year.topic.words <- get(paste0("topic.words.", yr))
  assign(
    paste0("topics.labels.", yr),
    vapply(
      seq_len(n.topics),
      function(k) {
        top <- mallet.top.words(topic.model, year.topic.words[k, ], num.top.words = 5)
        paste(top$words, collapse = " ")
      },
      character(1)
    )
  )
}
# vectorize them
# One t.<year> vector is created per year from the matching
# topics.labels.<year>. Generating the years in a single ordered sequence
# replaces the hand-written list, in which t.2001 through t.2005 were assigned
# twice and out of order.
label.years <- 1983:2015
for (label.year in label.years) {
  year.labels <- get(paste0("topics.labels.", label.year))
  assign(paste0("t.", label.year), as.vector(year.labels))
}
# create a matrix with all the topics over time
# rows are topics, columns are years; the column names (t.1983 ... t.2015)
# come from the names of the list returned by mget()
label.columns <- mget(paste0("t.", label.years))
topics.over.time <- do.call(cbind, label.columns)
Now we can look at how the topics have changed over the years, to see if Planned Parenthood has become a more politicized issue over time, or perhaps during certain election cycles.
# look at each topic individually -- row 1 of topics.over.time holds the
# first topic's label in every yearly column (t.1983 through t.2015)
topics.over.time[1, ]
## t.1983
## "life human role movement people"
## t.1984
## "church life catholic ferraro political"
## t.1985
## "church people bishop life public"
## t.1986
## "catholic church people life oriented"
## t.1987
## "life people world church religious"
## t.1988
## "church churches black bishop robertson"
## t.1989
## "life people fulghum wattleton church"
## t.1990
## "schroeder people life church issues"
## t.1991
## "church people catholic members life"
## t.1992
## "people ireland life public rights"
## t.1993
## "louise mero suicide life church"
## t.1994
## "people life black suicide vatican"
## t.1995
## "catholic church people life political"
## t.1996
## "gay life religious people church"
## t.1997
## "suicide assisted life people political"
## t.1998
## "ross book life people bishop"
## t.1999
## "life movement book people personal"
## t.2000
## "gore church people catholic life"
## t.2001
## "pedreira religious people baptist public"
## t.2002
## "people life ideas social ethnic"
## t.2003
## "people sex life moral personal"
## t.2004
## "church american public life people"
## t.2005
## "people life church conservative faith"
## t.2006
## "life war church power people"
## t.2007
## "life liberal people political marriage"
## t.2008
## "religious book people church life"
## t.2009
## "life people conservative church liberal"
## t.2010
## "black wetmore life nietzsche conservative"
## t.2011
## "life social public people church"
## t.2012
## "religious people life catholic social"
## t.2013
## "gay life marriage public people"
## t.2014
## "dunham book life social religious"
## t.2015
## "life people pope public opinion"
# the second topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[2, ]
## t.1983
## "harlem street city shop avenue"
## t.1984
## "center art sale arts tremaine"
## t.1985
## "papp street theater festival park"
## t.1986
## "street thrift avenue shop east"
## t.1987
## "street east manhattan artists city"
## t.1988
## "graffiti city street assemblyman manhattan"
## t.1989
## "street tickets manhattan park theater"
## t.1990
## "museum steichen street river house"
## t.1991
## "street tickets benefit dinner avenue"
## t.1992
## "tickets street benefit party fashion"
## t.1993
## "tickets street rock benefit music"
## t.1994
## "tickets street benefit dinner avenue"
## t.1995
## "street tickets kitchen dinner art"
## t.1996
## "kassindja patriarch hall miss kpalime"
## t.1997
## "center art museum street city"
## t.1998
## "tickets street hours avenue july"
## t.1999
## "tickets benefit street june dinner"
## t.2000
## "island raven fund barbash harbor"
## t.2001
## "island street today center film"
## t.2002
## "scheide music street library park"
## t.2003
## "guttenberg film street documentary west"
## t.2004
## "cooke street cooking dinner hamptons"
## t.2005
## "club village city zoning malloy"
## t.2006
## "artists street art east manhattan"
## t.2007
## "street hours road center hyder"
## t.2008
## "fossella stewart hammer hall island"
## t.2009
## "street songs petrusich east music"
## t.2010
## "solondz street art de greene"
## t.2011
## "street weiner east music petrusich"
## t.2012
## "street city music brooklyn center"
## t.2013
## "church brooklyn queens thompson attends"
## t.2014
## "art dance street film goldwyn"
## t.2015
## "street art show west friday"
# the third topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[3, ]
## t.1983 t.1984
## "town year years good time" "years year day time told"
## t.1985 t.1986
## "bours time people home year" "time people police bomb years"
## t.1987 t.1988
## "malvasi people fire time police" "people time year volunteers day"
## t.1989 t.1990
## "people time day years back" "people day years time year"
## t.1991 t.1992
## "home people police time years" "people time year years day"
## t.1993 t.1994
## "time people years mother day" "people year time job office"
## t.1995 t.1996
## "people year police years time" "people time men year years"
## t.1997 t.1998
## "people year time work men" "time people years mccaughey year"
## t.1999 t.2000
## "time kopp people year years" "nytimes people year time years"
## t.2001 t.2002
## "anthrax people back kopp work" "police people nelson day fire"
## t.2003 t.2004
## "people kopp years home time" "year people time child girl"
## t.2005 t.2006
## "people man time years told" "time people years year home"
## t.2007 t.2008
## "people told time year home" "shirt people years year job"
## t.2009 t.2010
## "ivins anthrax case years people" "keefe time years year day"
## t.2011 t.2012
## "people keefe told man time" "people told time day years"
## t.2013 t.2014
## "gilbert people time year day" "people time year work day"
## t.2015
## "people time shooting gun police"
# the fourth topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[4, ]
## t.1983
## "federal health reagan administration government"
## t.1984
## "population reagan united policy administration"
## t.1985
## "administration million population united groups"
## t.1986
## "million united company states advertising"
## t.1987
## "administration million federal reagan government"
## t.1988
## "million population united administration reagan"
## t.1989
## "groups million united states president"
## t.1990
## "company million united president companies"
## t.1991
## "million administration government fees federal"
## t.1992
## "united million president states administration"
## t.1993
## "administration health president united population"
## t.1994
## "million population health groups insurance"
## t.1995
## "administration president money health plan"
## t.1996
## "million united company american group"
## t.1997
## "money million carey federal council"
## t.1998
## "million company groups fidelity johnson"
## t.1999
## "million company money united companies"
## t.2000
## "million states companies company corzine"
## t.2001
## "groups federal administration money group"
## t.2002
## "administration agency health officials percent"
## t.2003
## "groups million company plan software"
## t.2004
## "administration health federal united agency"
## t.2005
## "agency administration groups president officials"
## t.2006
## "plan million administration president health"
## t.2007
## "million money funds percent president"
## t.2008
## "health million president administration groups"
## t.2009
## "insurance health coverage federal money"
## t.2010
## "million bank money health federal"
## t.2011
## "federal government health money administration"
## t.2012
## "health administration president million insurance"
## t.2013
## "health million administration year money"
## t.2014
## "million insurance health coverage year"
## t.2015
## "government states health money federal"
# the fifth topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[5, ]
## t.1983
## "republican vote president senate congress"
## t.1984
## "president house republican political issues"
## t.1985
## "senator republican campaign president house"
## t.1986
## "state senate campaign house committee"
## t.1987
## "republican house president issues campaign"
## t.1988
## "bush state president republican senator"
## t.1989
## "president governor republican political state"
## t.1990
## "bill senate governor president house"
## t.1991
## "house republican vote representative bush"
## t.1992
## "bush republican president campaign state"
## t.1993
## "clinton president campaign bill republican"
## t.1994
## "house clinton republican president senator"
## t.1995
## "house bill republican nomination republicans"
## t.1996
## "republican campaign clinton president bill"
## t.1997
## "clinton president house republican vote"
## t.1998
## "bill republican campaign pataki house"
## t.1999
## "governor bill republican bush clinton"
## t.2000
## "bush campaign republican president democratic"
## t.2001
## "mcgreevey bush schundler republican president"
## t.2002
## "forrester bill republican democrats house"
## t.2003
## "senate republican bush bill president"
## t.2004
## "bush president house democratic party"
## t.2005
## "senator republican democrats santorum president"
## t.2006
## "republican president senator senate bush"
## t.2007
## "republican bill obama campaign giuliani"
## t.2008
## "obama republican party senator political"
## t.2009
## "obama house senate president bill"
## t.2010
## "republican campaign president senator mcmahon"
## t.2011
## "house republican republicans democrats obama"
## t.2012
## "romney obama republican president campaign"
## t.2013
## "republican state party campaign bill"
## t.2014
## "republican state republicans democrats democratic"
## t.2015
## "republican republicans trump house obama"
# the sixth topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[6, ]
## t.1983
## "university chairman planned school college"
## t.1984
## "cousins university rockefeller planned parenthood"
## t.1985
## "university planned college school director"
## t.1986
## "college canfield university planned years"
## t.1987
## "president planned years university school"
## t.1988
## "planned president hepburn university city"
## t.1989
## "planned university school years parenthood"
## t.1990
## "university planned menninger years parenthood"
## t.1991
## "university college president years planned"
## t.1992
## "university president college planned dallas"
## t.1993
## "daughter university planned president father"
## t.1994
## "college school years planned university"
## t.1995
## "planned university college parenthood father"
## t.1996
## "husband university planned daughter parenthood"
## t.1997
## "planned parenthood family board norman"
## t.1998
## "university planned parenthood college board"
## t.1999
## "planned parenthood university died nytimes"
## t.2000
## "dyson planned university parenthood nytimes"
## t.2001
## "planned parenthood family board president"
## t.2002
## "board planned parenthood university wife"
## t.2003
## "leon levy planned parenthood board"
## t.2004
## "parenthood planned ny died school"
## t.2005
## "board planned rabbi school family"
## t.2006
## "died school university years children"
## t.2007
## "university planned years parenthood college"
## t.2008
## "mott planned parenthood years esther"
## t.2009
## "years school family university director"
## t.2010
## "university family planned parenthood life"
## t.2011
## "planned parenthood school law husband"
## t.2012
## "planned parenthood university school college"
## t.2013
## "planned parenthood family college years"
## t.2014
## "planned sage ny parenthood years"
## t.2015
## "planned parenthood university years family"
# the seventh topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[7, ]
## t.1983
## "court judge decision rule abortion"
## t.1984
## "court law abortion washburn case"
## t.1985
## "court state supreme law decision"
## t.1986
## "court abortion state roe decision"
## t.1987
## "court law judge bork supreme"
## t.1988
## "court abortion federal judge decision"
## t.1989
## "court abortion roe state supreme"
## t.1990
## "court judge souter abortion supreme"
## t.1991
## "court law abortion supreme case"
## t.1992
## "court justice abortion law case"
## t.1993
## "court law abortion justice supreme"
## t.1994
## "court souter law justice judge"
## t.1995
## "court law case abortion justice"
## t.1996
## "court rehnquist law justice case"
## t.1997
## "court state supreme decision law"
## t.1998
## "court law judge supreme abortion"
## t.1999
## "court justice law blackmun case"
## t.2000
## "court abortion law nebraska state"
## t.2001
## "court connor law ashcroft abortion"
## t.2002
## "court case supreme law judge"
## t.2003
## "court law bowers justice decision"
## t.2004
## "justice court law blackmun abortion"
## t.2005
## "court judge justice law abortion"
## t.2006
## "court justice judge law alito"
## t.2007
## "court stevens justice law abortion"
## t.2008
## "law court judge justice state"
## t.2009
## "court justice judge law abortion"
## t.2010
## "court law supreme case thomas"
## t.2011
## "law court state federal abortion"
## t.2012
## "court law justice state case"
## t.2013
## "court law supreme justice state"
## t.2014
## "court law supreme state justice"
## t.2015
## "court law case supreme justice"
# the eighth topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[8, ]
## t.1983
## "abortion planned abortions parenthood life"
## t.1984
## "abortion abortions planned clinics women"
## t.1985
## "abortion clinic clinics abortions planned"
## t.1986
## "abortion abortions women parenthood planned"
## t.1987
## "abortion abortions planned parenthood clinics"
## t.1988
## "abortion abortions women planned parenthood"
## t.1989
## "abortion abortions rights women planned"
## t.1990
## "abortion abortions planned parenthood women"
## t.1991
## "abortion clinics abortions parenthood clinic"
## t.1992
## "abortion abortions women clinics rights"
## t.1993
## "abortion abortions clinic clinics rights"
## t.1994
## "abortion clinic parenthood planned clinics"
## t.1995
## "abortion clinic abortions clinics parenthood"
## t.1996
## "abortion abortions parenthood planned salvi"
## t.1997
## "abortion abortions parenthood planned rights"
## t.1998
## "abortion abortions clinic anti doctors"
## t.1999
## "abortion abortions doctors rights clinic"
## t.2000
## "abortion abortions mifepristone doctors planned"
## t.2001
## "abortion clinics planned parenthood federal"
## t.2002
## "abortion planned parenthood clinic abortions"
## t.2003
## "abortion abortions rights planned parenthood"
## t.2004
## "abortion records planned rights parenthood"
## t.2005
## "abortion planned parenthood abortions rights"
## t.2006
## "abortion abortions planned parenthood rights"
## t.2007
## "abortion abortions women parenthood planned"
## t.2008
## "abortion grand planned parenthood abortions"
## t.2009
## "abortion tiller abortions clinic women"
## t.2010
## "abortion abortions planned parenthood doctors"
## t.2011
## "abortion abortions planned parenthood state"
## t.2012
## "abortion parenthood planned abortions women"
## t.2013
## "abortion abortions women texas clinics"
## t.2014
## "abortion clinics women abortions clinic"
## t.2015
## "abortion parenthood planned tissue fetal"
# the ninth topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[9, ]
## t.1983
## "family parents health planned planning"
## t.1984
## "family planning county programs services"
## t.1985
## "family teen planning program services"
## t.1986
## "teen agers school education family"
## t.1987
## "family teen planned school planning"
## t.1988
## "family planning health services program"
## t.1989
## "teen parents aids year health"
## t.1990
## "aids teen health family planned"
## t.1991
## "family planning health program teen"
## t.1992
## "family aids health services planned"
## t.1993
## "school teen program care family"
## t.1994
## "school teen planned family health"
## t.1995
## "health care foster services planned"
## t.1996
## "aids teen family sex children"
## t.1997
## "family care planning planned parenthood"
## t.1998
## "family health services calderone planning"
## t.1999
## "health care school condom education"
## t.2000
## "sex parents health school education"
## t.2001
## "family planned programs education services"
## t.2002
## "health services family care city"
## t.2003
## "sex sexual health school family"
## t.2004
## "sex education alberto jasmine health"
## t.2005
## "sex family parents education abstinence"
## t.2006
## "family sex sexual parents health"
## t.2007
## "sex health education planned abstinence"
## t.2008
## "health care school age planned"
## t.2009
## "health family sex care education"
## t.2010
## "education sex health abstinence family"
## t.2011
## "sex health services family parenthood"
## t.2012
## "health planned care parenthood family"
## t.2013
## "health family care parenthood planning"
## t.2014
## "health care planned sex family"
## t.2015
## "health planned parenthood family services"
# the tenth topic: its label in every yearly column (t.1983 through t.2015)
topics.over.time[10, ]
## t.1983
## "women drug sponge contraceptive birth"
## t.1984
## "women birth infant study sperm"
## t.1985
## "women control birth percent men"
## t.1986
## "women birth percent genetic iud"
## t.1987
## "women birth control percent woman"
## t.1988
## "women drug control iud birth"
## t.1989
## "women percent woman drug pill"
## t.1990
## "women drug pill birth control"
## t.1991
## "women norplant birth woman medical"
## t.1992
## "women drug pregnancy norplant birth"
## t.1993
## "women pill control doctor birth"
## t.1994
## "women ru drug percent control"
## t.1995
## "women birth control percent woman"
## t.1996
## "women birth men percent cancer"
## t.1997
## "women pregnancy contraception drug birth"
## t.1998
## "women birth percent woman control"
## t.1999
## "women drug doctors research sponge"
## t.2000
## "women drug pill birth pregnancy"
## t.2001
## "women safe baby birth contraceptive"
## t.2002
## "women cancer pill research control"
## t.2003
## "women drug birth pills control"
## t.2004
## "women drug birth pregnancy pill"
## t.2005
## "women drug pill birth morning"
## t.2006
## "women drug pill birth contraception"
## t.2007
## "women birth research percent woman"
## t.2008
## "women study control risk lifers"
## t.2009
## "women pills study drug dominican"
## t.2010
## "women birth pregnancy control contraception"
## t.2011
## "women birth control percent tanton"
## t.2012
## "women komen contraception birth cancer"
## t.2013
## "women cancer breast percent drug"
## t.2014
## "women woman gomperts birth control"
## t.2015
## "women research control woman percent"
The ninth topic is especially interesting—it appears to track scandals or politicized issues that Planned Parenthood is embroiled in. To take a peek at how that has changed over the years, we can see that in 1985, the most common words in the topic were “bours public office called investigation.” In 1993, they were “death suicide public told office.” In 1999, they were “kopp smith web death site.” In 2006, they were “death kline group found called.” In 2012, they were “told video case web kimbrough.” And in 2015, they were “tissue fetal video planned people.” (Note that topic numbering and top words can shift from one model run to the next, so the words quoted here may not match the ninth topic in the output printed above.) We can also trace changes in the topic about the Supreme Court, the topic about elections, and the topic about sex education—these prove to be very informative topics from which we can build interesting additional research questions!
We can also represent these topics visually, as follows:
# with the wordcloud package
# First a single large cloud: the 100 highest-weighted words of one topic.
# topic.num and num.top.words now actually drive the call instead of being
# assigned and then shadowed by repeated literals.
topic.num <- 1
num.top.words <- 100
topic.top.words <- mallet.top.words(topic.model, topic.words[topic.num, ], num.top.words)
wordcloud(topic.top.words$words, topic.top.words$weights, c(4, .8), rot.per = 0,
          random.order = FALSE, colors = "red", ordered.colors = TRUE)
# Then one smaller cloud (25 words) for every topic in the model.
# num.topics follows n.topics so the loop stays in step with the model size
# rather than a separately hard-coded 10.
num.topics <- n.topics
num.top.words <- 25
for (i in seq_len(num.topics)) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i, ], num.top.words)
  # c(4, .8) is the font-size scale; rot.per = 0 keeps every word horizontal
  wordcloud(topic.top.words$words, topic.top.words$weights, c(4, .8), rot.per = 0,
            random.order = FALSE, colors = "red", ordered.colors = TRUE)
}
And create a cluster dendrogram.
# from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
# flip doc.topics so topics are rows, then divide every row by its own total
# so that each topic's row sums to 1
topic.docs <- t(doc.topics)
topic.docs <- sweep(topic.docs, 1, rowSums(topic.docs), "/")
# keep a copy of the normalized topic-by-document matrix on disk
write.csv(topic.docs, "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-docs2.csv")
# Get a vector containing short names for the topics:
# each label is the topic's five highest-weighted words joined by spaces.
# seq_len() keeps the result empty (instead of indexing rows 1 and 0) should
# n.topics ever be 0, and vapply() guarantees one string per topic.
topics.labels <- vapply(
  seq_len(n.topics),
  function(topic) {
    top.words <- mallet.top.words(topic.model, topic.words[topic, ], num.top.words = 5)
    paste(top.words$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "life people church public religious"
## [2] "street tickets center avenue art"
## [3] "people time year years day"
## [4] "million health administration money united"
## [5] "republican president house obama republicans"
## [6] "planned parenthood university years school"
## [7] "court law justice abortion supreme"
## [8] "abortion abortions planned parenthood women"
## [9] "health family planned sex school"
## [10] "women birth drug control percent"
# save the topic labels next to the other results
write.csv(topics.labels, "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-labels2.csv")
# data.frame view of topic.docs (topics as rows), with each column named
# after the corresponding document id
topic_docs <- setNames(data.frame(topic.docs), pp.1982$id)
# cluster the topics on their word weights and draw the tree into a PNG;
# hclust(dist(...)) stays a single expression because plot() takes its
# x-axis label from that call
topic.tree <- hclust(dist(topic.words))
png("Graphic_PPDendogram.png")
plot(topic.tree, labels = topics.labels)
dev.off()
## quartz_off_screen
## 2